// tables.do

clear all
capture log close	
ssc install reghdfe
ssc install ivreghdfe

cd "projectfolder"

use "analysis_data_organized.dta", clear

qui reg EA_new firstborn ea_new_ukb_ld ea_new_23me_ukb_ld rank family_size last_child sex
keep if e(sample)

egen N_famid_new=count(famid) if famid!=., by(famid)
tab N_famid_new
drop if N_famid_new==1
count

// Standardize the polygenic scores 
global pgs_new ea_new_ukb_ld ea_new_ukb_ld_0 ea_new_ukb_ld_1 ea_new_23me_ukb_ld ea_23me_ld
foreach i in $pgs_new {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}
 
// 14,850

* Testing specification of Booth & Kee (2009)
gen higher_edu=(EA_new==20)
probit higher_edu rank sex i.YoB i.MoB e_PC* family_size_imp
margins, dydx(rank)
	   

********************************************************************************
* APPENDIX C
* TABLE C.2. RESULTS OF REGRESSIONS OF YEARS OF EDUCATION ON THE POLYGENIC SCORE
******************************************************************************** 
xtset famid ID
global controls sex i.YoB i.MoB e_PC* 

// without family fixed effects  
qui eststo M1: reg   EA_new $controls if famid!=., robust
qui eststo M2: reg   EA_new $controls ea_new_ukb_ld if famid!=., robust
// with family fixed effects 
qui eststo M3: xtreg EA_new $controls if famid!=., cluster(famid) fe
qui eststo M4: xtreg EA_new $controls ea_new_ukb_ld if famid!=., cluster(famid) fe

esttab M1 M2 M3 M4, ///
       noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
       stats(r2 N, fmt(3 0)) drop(e_PC_* *.MoB *.YoB sex) se(3) b(3) ///
	   title("Table C.2. Results of the regressions of years of education on the polygenic score (PGS)") ///
	   mtitles("No PGSs" "Between Siblings" "No PGSs FE" "Sibling FE") ///
	   addnotes("Notes: Robust standard errors in parentheses, clustered by family;" ///
	   "Coefficients for the control variables (year and month of birth, gender and the" ///
	   "first 40 PCs of the genetic relatedness matrix) are not displayed, but available upon request from the authors.") 
	   
est drop _all	   


*****************************************************************************
* TABLE C.3. REGRESSIONS OF YEARS OF EDUCATION ON DIFFERENT SPECIFICATIONS OF 
*            BIRTH ORDER
*****************************************************************************
xtset famid ID
global controls sex i.YoB i.MoB e_PC* family_size_imp

// without family fixed effects 
qui eststo M1: reg    EA_new firstborn $controls if N_famid!=1, vce(robust)
qui eststo M2: reg    EA_new i.rank $controls if N_famid!=1, vce(robust)
// with family fixed effects 
qui eststo M3: xtreg  EA_new firstborn $controls if N_famid!=1, cluster(famid) fe
qui eststo M4: xtreg  EA_new i.rank $controls if N_famid!=1, cluster(famid) fe

esttab M1 M2 M3 M4, ///
      noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
      stats(r2 N, fmt(3 0)) drop(e_PC_* *.MoB *.YoB sex family_size_imp) se(3) b(3) ///
	  title("Table C.2: Regression of EA on different specifications of birth order ") ///
	  mtitles("Between Siblings" "Between Siblings" "Between Siblings" "Sibling FE" "Sibling FE" "Sibling FE" "Sibling FE") ///
	  addnotes("Robust standard errors in parentheses, clustered by; Coefficients for the control variables (year and month of birth," ///
	  "gender and the first 40 PCs of the genetic relatedness matrix) are not displayed, but available upon request from the authors") 
est drop _all


**********************************************************************
* TABLE C.4. REGRESSIONS OF POLYGENIC SCORE FOR EDUCATIONAL ATTAINMENT 
*            ON BIRTH ORDER 
**********************************************************************
xtset famid ID
global controls sex i.YoB i.MoB e_PC* family_size_imp

// without family fixed effects 
qui eststo M1: reg    ea_new_ukb_ld firstborn $controls, vce(robust)
qui eststo M2: reg    ea_new_ukb_ld i.rank $controls, vce(robust)
// with family fixed effects 
qui eststo M3: xtreg  ea_new_ukb_ld firstborn $controls, cluster(famid) fe
qui eststo M4: xtreg  ea_new_ukb_ld i.rank $controls, cluster(famid) fe

esttab M1 M2 M3 M4, ///
      noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
      stats(r2 N, fmt(3 0)) drop(e_PC_* *.MoB *.YoB sex family_size_imp) se(3) b(3) ///
	  title("Table C.3: Results of the regressions explaining the PGS of Education Years") ///
	  mtitles("Between Siblings" "Between Siblings" "Sibling FE" "Sibling FE") ///
	  addnotes("Robust standard errors in parentheses, clustered by; Coefficients "///
	  "for the control variables (year and month of birth, gender and the first 40 " ///
	  "PCs of the genetic relatedness matrix)) are not displayed, but available upon request from the authors.") 

est drop _all


***********************************************************************************
* TABLE C.5. REGRESSIONS OF YEARS OF EDUCATION ON THE GENE-ENVIRONMENT INTERACTIONS;
*            ROBUSTNESS TO NON-LINEARITIES IN THE POLYGENIC SCORE 
***********************************************************************************
// generate stratified scores by quartiles and by mean
gen abovemean_pgs_new = (ea_new_ukb_ld>0)
xtile pgs = ea_new_ukb_ld, nq(4)
tab pgs, gen(pgs_new_)
gen first_pgs2_new = firstborn*pgs_new_2
gen first_pgs3_new = firstborn*pgs_new_3
gen first_pgs4_new = firstborn*pgs_new_4
gen pgs_squared_new = ea_new_ukb_ld^2
gen pgs_squared_firstborn_new =pgs_squared_new*firstborn

xtset famid ID
global controls sex i.YoB i.MoB e_PC*

qui eststo M1: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
qui eststo M2: xtreg  EA_new i.firstborn abovemean_pgs_new c.firstborn#c.abovemean_pgs_new $controls, cluster(famid) fe
qui eststo M3: xtreg  EA_new i.firstborn pgs_new_2 pgs_new_3 pgs_new_4 first_pgs2_new first_pgs3_new first_pgs4_new $controls, cluster(famid) fe
qui eststo M4: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld pgs_squared_new pgs_squared_firstborn_new $controls, cluster(famid) fe

esttab M1 M2 M3 M4, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(r2 N, fmt(3 0)) drop(*.MoB *.YoB e_PC_* sex) ///
	   title("Table C.4. Regressions of years of education on the gene-environment interaction; Robustness to non-linearities in the polygenic score.") ///
	   se(3) b(3) ///
	   mtitles("Main Result" "Binary PGS" "PGS in Quartiles" "Squared PGS") ///
	   addnotes("Robust standard errors in parentheses, clustered by family;" ///
	   "Coefficients for the control variables (year and month of birth, gender" ///
	   "and the first 40 PCs of the genetic relatedness matrix) are not displayed," ///
	   "but available upon request from the authors.") 

***********************************************************************************
* TABLE C.6. REGRESSIONS OF YEARS OF EDUCATION ON THE GENE-ENVIRONMENT INTERACTIONS
*            ROBUSTNESS TO NON-LINEARITIES IN THE BIRTH ORDER
***********************************************************************************
xtset famid ID
global controls sex i.YoB i.MoB e_PC*

qui eststo M1: xtreg  EA_new ea_new_ukb_ld i.firstborn firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
qui eststo M2: xtreg  EA_new ea_new_ukb_ld i.rank i.rank#c.ea_new_ukb_ld $controls, cluster(famid) fe

esttab M1 M2, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(r2 N, fmt(3 0)) drop(*.MoB *.YoB e_PC_* sex) ///
	   title("Table C.5. Regressions of years of education on the gene-environment interaction; Robustness to non-linearities in the birth order.") ///
	   se(3) b(3) ///
	   mtitles("Main result" "Sibling FE") ///
	   addnotes("Robust standard errors in parentheses, clustered by family;" ///
	   "Coefficients for the control variables (year and month of birth, gender" /// 
	   "and the first 40 PCs of the genetic relatedness matrix) are not displayed, but available upon request.") 

est drop _all


***********************************************************************************
* TABLE C.7. REGRESSIONS OF YEARS OF EDUCATION ON THE GENE-ENVIRONMENT INTERACTIONS
*            ROBUSTNESS TO FERTILITY CHOICES 
***********************************************************************************
xtset famid ID
global controls sex i.YoB i.MoB e_PC*

qui eststo M1: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
qui eststo M2: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld last_child $controls, cluster(famid) fe
qui eststo M3: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls if family_size_imp<4, cluster(famid) fe
qui eststo M4: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls if family_size_imp==2, cluster(famid) fe

esttab M1 M2 M3 M4, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(r2 N, fmt(3 0)) drop(*.MoB *.YoB e_PC_*) title("Table C.6. Robustness to fertility choices and missing confounders.") ///
	   se(3) b(3) ///
	   mtitles("Main Results" "Child stopping rule" "Family size<4" "Family size=2") ///
	   addnotes("Column 1 replicates our main results from Table 2; ///
	   "Column 2 additionally controls for a dummy indicating whether the individual is the lastborn;" ///
	   "Column 3 restricts the sample to families with less than four siblings;" ///
	   "Columns 4 restricts the sample to families with two siblings only." ///
	   "Robust standard errors in parentheses, clustered by family;"///
	   "Coefficients for the control variables (year and month of birth,"///
	   "family size, and the first 40 PCs of the genetic relatedness matrix) are not displayed, but available upon request from the authors.") 
	   
est drop _all


**************************************************************
* APPENDIX D
* TABLE D.1. REGRESSIONS OF EARLY LIFE PARENTAL INVESTMENTS ON 
*            THE GENE-ENVIRONMENT INTERACTION
**************************************************************
xtset famid ID
global controls sex i.YoB i.MoB e_PC*

qui eststo M1: xtreg c_breastfed i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
qui eststo M2: xtreg c_mumsmokingbirth i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe

// age gap between siblings in months 
xtset famid ID
sort famid YoB MoB
gen  agegapmonths = (YoB[_n+1] - YoB)*12 + (MoB[_n+1]-MoB) if famid==famid[_n+1] & (rank==rank[_n+1]-1)
hist agegapmonths
tab rank if agegapmonths!=.

qui eststo M3: xtreg agegapmonths ea_new_ukb_ld $controls family_size_imp, cluster(famid)
qui eststo M4: xtreg agegapmonths ea_new_ukb_ld $controls , cluster(famid) fe

esttab M1 M2 M3 M4, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(r2 N, fmt(3 0)) drop(*.MoB *.YoB e_PC_* sex family_size_imp) title("Table D.1. Early life parental investments") ///
	   se(3) b(3) ///
	   mtitles("Breastfed" "Mother smoked around birth" "Age gap BF" "Age gap WF") ///
	   addnotes("Robust standard errors in parentheses, clustered by family;"///
	   "Coefficients for the control variables (year and month of birth, gender," ///
	   "family size, and the first 40 PCs of the genetic relatedness matrix) are not displayed," ///
	   "but available upon request from the authors. Sample sizes vary depending on the availability" ///
	   "of early life parental investments and the number of siblings included in the analyses.")    

est drop _all

******************************************************************************
* APPENDIX E
* TABLE E.1. RESULTS OF REGRESSIONS OF YEARS OF EDUCATION ON
*            THE GENE-ENVIRONMENT INTERACTION AND ADDITIONAL CONTROL VARIABLES 
******************************************************************************
xtset famid ID
global controls sex i.YoB i.MoB e_PC*

// Column 1: main results 
qui eststo main:      xtreg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe

// Column 2 Robustness to Keller (2014) correction 

** OLS version 
qui eststo fe_keller: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls ///
                      i.YoB#firstborn i.MoB#firstborn sex#firstborn c.e_PC*#firstborn ///
					  i.YoB#c.ea_new_ukb_ld i.MoB#c.ea_new_ukb_ld sex#c.ea_new_ukb_ld ///
					  c.e_PC*#c.ea_new_ukb_ld, cluster(famid) fe

// Column 3 Robustness to Keller (2014) correction ORIV 
				  
** COMPUTE THE CORRELATION BETWEEN THE SCORES TO DETERMINE THE SCALING FACTOR
reg ea_new_ukb_ld_0 ea_new_ukb_ld_1
sca corrscores = _b[ea_new_ukb_ld_1]
qui replace ea_new_ukb_ld_0 = ea_new_ukb_ld_0/sqrt(corrscores)
qui replace ea_new_ukb_ld_1 = ea_new_ukb_ld_1/sqrt(corrscores)

** Create the interaction vars 
gen firstborn_0=firstborn*ea_new_ukb_ld_0
gen firstborn_1=firstborn*ea_new_ukb_ld_1

** Run IV FE sequentially
qui eststo fe_ivukb0: xtivreg EA_new firstborn (ea_new_ukb_ld_0 firstborn_0 = ea_new_ukb_ld_1 firstborn_1) $controls, fe vce(cluster famid)
qui eststo fe_ivukb1: xtivreg EA_new firstborn (ea_new_ukb_ld_1 firstborn_1 = ea_new_ukb_ld_0 firstborn_0) $controls, fe vce(cluster famid)


tab YoB, gen(YoB_)		
tab MoB, gen(MoB_)
global YoB YoB_*
global MoB MoB_*
global PC e_PC*
			  
preserve
expand 2, generate(replicant)
egen newid = concat(famid replicant)
gen long newidreal = real(newid)
xtset newidreal
gen mainVar = ea_new_ukb_ld_0  if replicant == 0
gen mainVarint = firstborn * ea_new_ukb_ld_0  if replicant==0
replace mainVar = ea_new_ukb_ld_1 if replicant == 1
replace mainVarint = firstborn * ea_new_ukb_ld_1 if replicant==1
gen pgi_sex = sex * ea_new_ukb_ld_0  if replicant==0
replace pgi_sex = sex * ea_new_ukb_ld_1  if replicant==1

foreach YoB of varlist YoB_* {
	gen pgi_`YoB' = `YoB' * ea_new_ukb_ld_0  if replicant==0
	replace pgi_`YoB' = `YoB' * ea_new_ukb_ld_1 if replicant==1
}

foreach MoB of varlist MoB_* {
	gen pgi_`MoB' = ea_new_ukb_ld_0 * `MoB' if replicant==0
	replace pgi_`MoB' = ea_new_ukb_ld_1 * `MoB' if replicant==1
}

foreach e_PC of varlist e_PC* {
	gen pgi_`e_PC' = ea_new_ukb_ld_0 * `e_PC' if replicant==0
	replace pgi_`e_PC' = ea_new_ukb_ld_1 * `e_PC' if replicant==1
}

gen instrument = ea_new_ukb_ld_1 if replicant == 0
gen instrumentint = firstborn * ea_new_ukb_ld_1 if replicant==0
replace instrument = ea_new_ukb_ld_0 if replicant == 1
replace instrumentint = firstborn * ea_new_ukb_ld_0 if replicant==1
gen inst_pgi_sex = sex * ea_new_ukb_ld_1  if replicant==0
replace inst_pgi_sex = sex * ea_new_ukb_ld_0  if replicant==1

foreach YoB of varlist YoB_* {
	gen inst_pgi_`YoB' = `YoB' * ea_new_ukb_ld_1  if replicant==0
	replace inst_pgi_`YoB' = `YoB' * ea_new_ukb_ld_0 if replicant==1
}

foreach MoB of varlist MoB_* {
	gen inst_pgi_`MoB' = ea_new_ukb_ld_1 * `MoB' if replicant==0
	replace inst_pgi_`MoB' = ea_new_ukb_ld_0 * `MoB' if replicant==1
}

foreach e_PC of varlist e_PC* {
	gen inst_pgi_`e_PC' = ea_new_ukb_ld_1 * `e_PC' if replicant==0
	replace inst_pgi_`e_PC' = ea_new_ukb_ld_0 * `e_PC' if replicant==1
}

reghdfe mainVar instrument $controls3, absorb(newidreal) cluster(famid ID)
testparm instrument
ivreghdfe EA_new (mainVar mainVarint pgi_sex pgi_YoB_* pgi_MoB_* pgi_e_PC* = ///
                 instrument instrumentint inst_pgi_sex inst_pgi_YoB_* inst_pgi_MoB_* inst_pgi_e_PC*) ///
                 firstborn $controls family_size_imp ///
                 i.YoB#firstborn i.MoB#firstborn sex#firstborn c.e_PC*#firstborn, ///
				 absorb(newidreal, res save) cluster(famid ID) 
estimates store fe_oriv_keller
sum __hdfe1__

restore

// Columns 4-7: Mothers & fathers age 
** Parental age reported at the assessment centre, to avoid confusion with the assessment dates, check the reported age at the assessment centre, not the YoB 
sum motherage fatherage if famid!=.
sum c_motherage_0 c_motherage_1 c_motherage_2 c_fatherage_0 c_fatherage_1 c_fatherage_2 
sum h_date_0 h_date_1 h_date_2

gen date_0=date(h_date_0, "DMY")
gen visit_yr0=year(date_0)

gen date_1=date(h_date_1, "DMY")
gen visit_yr1=year(date_1)

gen date_2=date(h_date_2, "DMY")
gen visit_yr2=year(date_2)


** Mother 
gen momYoB_0 = visit_yr0 - c_motherage_0
gen momYoB_1 = visit_yr1 - c_motherage_1
gen momYoB_2 = visit_yr2 - c_motherage_2

gen momYoB = momYoB_0
replace momYoB = momYoB_1 if momYoB_0 ==.
replace momYoB = momYoB_2 if momYoB_0 ==. & momYoB_1 ==.

gen momagebirth=YoB - momYoB
sum momYoB momagebirth


**Father 
gen dadYoB_0 = visit_yr0 - c_fatherage_0
gen dadYoB_1 = visit_yr1 - c_fatherage_1
gen dadYoB_2 = visit_yr2 - c_fatherage_2

gen dadYoB = dadYoB_0
replace dadYoB = dadYoB_1 if dadYoB_0 ==.
replace dadYoB = dadYoB_2 if dadYoB_0 ==. & dadYoB_1 ==.

gen dadagebirth=YoB - dadYoB

sum dadYoB dadagebirth

xtset famid ID
global controls sex i.YoB i.MoB e_PC*
reg EA_new momagebirth
qui eststo mum_age: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld momagebirth $controls if e(sample), cluster(famid) fe
qui eststo mum_age_int: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld momagebirth c.ea_new_ukb_ld#c.momagebirth  $controls if e(sample), cluster(famid) fe
reg EA_new dadagebirth
qui eststo dad_age:   xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld dadagebirth $controls if e(sample), cluster(famid) fe
qui eststo dad_age_int: xtreg  EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld dadagebirth c.ea_new_ukb_ld#c.dadagebirth $controls if e(sample), cluster(famid) fe

esttab main fe_keller fe_oriv_keller mum_age mumdad_age_int dad_age dad_age_int, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(r2 N, fmt(3 0)) drop(*.MoB* *.YoB* *e_PC_* pgi*) title("Table E.1. Results of the regressions of years of education on the gene-environment interaction and additional control variables.") ///
	   se(3) b(3) ///
	   addnotes("Robust standard errors in parentheses, clustered by family; Coefficients for the control variables (year and month of birth, gender and the first 40 PCs of the genetic relatedness matrix)" ///
	   "are not displayed, but available upon request from the authors. Column 1 replicates our main results for comparison. Column 2 includes as additional control variables the interactions between firstborn" ///
	   "and the polygenic score with year of birth, month of birth, gender and the first 40 PCs. The direct effect sizes of being firstborn and the polygenic score for years of education are now relative to the" ///
	   "reference categories of the control variables (born in 1937, born in January, being female). Column 3 provides the ORIV estimation of the specification in Column 2. Columns 4 and 6 include controls for" ///
	   "maternal age at birth and paternal age at birth, respectively. Columns 5 and 7 include controls for maternal age at birth and paternal age at birth and interacts them with the polygenic score") 

	   est drop _all 
	   

********************************************************************************
* Table E.2 Results of the regressions of years of education on the 
*           gene-environment interaction by socioeconomic status.
********************************************************************************
merge 1:1 ID using "social_class_at_gid_1951.dta", gen(_class)
drop if _class==2 // drop those outside of our analysis sample 
count //14,850

* Left education at 15 proportional to total (sum of Class1-5)
sum prop_male_left_education_15 


xtset famid ID
global controls sex i.YoB i.MoB e_PC*
qui eststo main: xtreg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
* remove those unmatched 
preserve 
keep if _class==3 // 13,812
* replicate the main analysis in the smaller sample 
qui eststo main_ses: xtreg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls, cluster(famid) fe
* below and above median high ses as compared to all districts 
tab above_med_left_edu_15
qui eststo above_median: xtreg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls if above_med_left_edu_15==1, cluster(famid) fe
qui eststo below_median: xtreg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls if above_med_left_edu_15==0, cluster(famid) fe

esttab main main_ses above_median below_median, ///
      noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
      stats(r2 N, fmt(3 0)) drop(e_PC_* *.MoB *.YoB sex) se(3) b(3) ///
	  title("Table E.2 Results of the regressions of years of education on the gene-environment interaction, stratified by district-level socioeconomic status.") ///
	  mtitles("Main" "SESsample" "low SES" "high SES") ///
	  addnotes("Robust standard errors in parentheses, clustered by family; Coefficients for the control variables (year and month of birth, gender," ///
	  "family size and the first 40 PCs of the genetic relatedness matrix) are not displayed, but available upon request from the authors. SES sample" ///
	  "is the subsample with data on district-level SES. Low SES includes individuals born in districts in which the proportion of males leaving education" ///
	  "at age 15 is above the median based on the 1951 UK Census. High SES includes those born in districts in which the proportion of males leaving education at 15 is below the median.") 
est drop _all
restore 

**********************************************************************
* APPENDIX F
* TABLE F.1. REGRESSIONS OF YEARS OF EDUCATION ON THE GENE-ENVIRONMENT 
*            INTERACTION WITH THE REPOSITORY-BASED POLYGENIC SCORE (PGS)
**********************************************************************
// WARNING: CHANGE IN SAMPLE SIZE!
merge 1:1 ID using "UKB1_PGIrepo_v1.0.dta", gen(_scores1)
preserve
drop if _scores1==2
* Count, 15 obs from our original sample are missing 
* 14,835
count

// select single trait pgs
global pgi pgi_activitysingle pgi_adhdsingle pgi_adventuresingle pgi_afbsingle pgi_asteczrhisingle pgi_asthmasingle pgi_auditsingle pgi_bmisingle pgi_cannabissingle pgi_cpdsingle pgi_cpsingle pgi_depsingle pgi_dpwsingle pgi_easingle pgi_eversmokesingle pgi_extrasingle pgi_famsatsingle pgi_friendsatsingle pgi_hayfeversingle pgi_heightsingle pgi_highmathsingle pgi_leftoutsingle pgi_menarchesingle pgi_migrainesingle pgi_morningsingle pgi_narcissingle pgi_nearsightedsingle pgi_nebwomensingle pgi_neurosingle pgi_opensingle pgi_readingsingle pgi_religattsingle pgi_risksingle pgi_selfhealthsingle pgi_selfmathsingle pgi_swbsingle pgi_adhdmulti pgi_adventuremulti pgi_afbmulti pgi_allergycatmulti pgi_allergydustmulti pgi_allergypollenmulti pgi_asteczrhimulti pgi_asthmamulti pgi_auditmulti pgi_cogempmulti pgi_copdmulti pgi_cpmulti pgi_delaydiscmulti pgi_depmulti pgi_dpwmulti pgi_eamulti pgi_extramulti pgi_famsatmulti pgi_finsatmulti pgi_friendsatmulti pgi_hayfevermulti pgi_highmathmulti pgi_leftoutmulti pgi_lonelymulti pgi_menarchemulti pgi_nebmenmulti pgi_nebwomenmulti pgi_neuromulti pgi_religattmulti pgi_riskmulti pgi_selfhealthmulti pgi_selfmathmulti pgi_swbmulti pgi_voicedeepmulti pgi_worksatmulti

// standardize within the analysis sample 
foreach i in $pgi {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}
sum $pgi 

xtset famid ID
global controls sex i.YoB i.MoB e_PC*

qui eststo M2: xtreg EA_new pgi_easingle  $controls, cluster(famid) fe
qui eststo M3: xtreg EA_new pgi_easingle  i.firstborn firstborn#c.pgi_easingle  $controls, cluster(famid) fe
qui eststo M1: xtreg EA_new $controls if e(sample), cluster(famid) fe 
esttab M1 M2 M3, ///
      noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
      stats(r2 N, fmt(3 0)) drop(e_PC_* *.MoB *.YoB sex) se(3) b(3) ///
	  title("Table F.2. Regressions of years of education on the gene-environment interaction with the repository-based polygenic score (PGS).") ///
	  mtitles("No PGS" "With PGS" "GxE") ///
	  addnotes("Robust standard errors in parentheses, clustered by family; Coefficients for the control variables (year and month of birth," ///
	  "gender and the first 40 PCs) are not displayed, but available upon request from the authors; The PGS for years of education in Columns 1-3" ///
	  "is based on the single-trait polygenic score for years of education from the publicly available polygenic index repository (Becker et al. 2021).") 
est drop _all
restore


********************************************************************************
* Figure F.1.: Permutation based distribution of the difference in coefficients 
* for the G×E interplay using the repository PGS vs. the UK Biobank PGS
********************************************************************************
merge 1:1 ID using "UKB1_PGIrepo_v1.0.dta", gen(_scores1)
preserve
drop if _scores1==2
* Count, 15 obs from our original sample are missing 
* 14,835
count

// select ingle trait pgs
global pgi pgi_activitysingle pgi_adhdsingle pgi_adventuresingle pgi_afbsingle pgi_asteczrhisingle pgi_asthmasingle pgi_auditsingle pgi_bmisingle pgi_cannabissingle pgi_cpdsingle pgi_cpsingle pgi_depsingle pgi_dpwsingle pgi_easingle pgi_eversmokesingle pgi_extrasingle pgi_famsatsingle pgi_friendsatsingle pgi_hayfeversingle pgi_heightsingle pgi_highmathsingle pgi_leftoutsingle pgi_menarchesingle pgi_migrainesingle pgi_morningsingle pgi_narcissingle pgi_nearsightedsingle pgi_nebwomensingle pgi_neurosingle pgi_opensingle pgi_readingsingle pgi_religattsingle pgi_risksingle pgi_selfhealthsingle pgi_selfmathsingle pgi_swbsingle pgi_adhdmulti pgi_adventuremulti pgi_afbmulti pgi_allergycatmulti pgi_allergydustmulti pgi_allergypollenmulti pgi_asteczrhimulti pgi_asthmamulti pgi_auditmulti pgi_cogempmulti pgi_copdmulti pgi_cpmulti pgi_delaydiscmulti pgi_depmulti pgi_dpwmulti pgi_eamulti pgi_extramulti pgi_famsatmulti pgi_finsatmulti pgi_friendsatmulti pgi_hayfevermulti pgi_highmathmulti pgi_leftoutmulti pgi_lonelymulti pgi_menarchemulti pgi_nebmenmulti pgi_nebwomenmulti pgi_neuromulti pgi_religattmulti pgi_riskmulti pgi_selfhealthmulti pgi_selfmathmulti pgi_swbmulti pgi_voicedeepmulti pgi_worksatmulti

// standardize within the analysis sample 
foreach i in $pgi {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}
sum $pgi 

* define controls 
xtset famid ID
global controls3 sex i.YoB i.MoB e_PC* family_size_imp

capture program drop famfe_seg

program famfe_seg, rclass
    args pgi1 pgi2 firstborn_p
    qui xtreg EA_new c.`firstborn_p'##c.`pgi1' $controls3 if N_famid != 1, fe cluster(famid)
    estimates store seg_first_pgi1
    sca bfirstborn_pgi1 = _b[c.`firstborn_p'#c.`pgi1']

    qui xtreg EA_new c.`firstborn_p'##c.`pgi2' $controls3 if N_famid != 1, fe cluster(famid)
    estimates store seg_first_pgi2
    sca bfirstborn_pgi2 = _b[c.`firstborn_p'#c.`pgi2']

    return scalar diff_fb = bfirstborn_pgi2 - bfirstborn_pgi1
end

*gen id=_n
set seed 1
local n_permfb = 500
mat permfb = J(`n_permfb', 1, 0)

forvalues j = 1/`n_permfb' {
    di `j'
    capture drop u
    capture drop upermvar1 upermvar2 upermvar3
    gen double u = runiform()
    sort u

    // Generate permuted variables
    local type: type ea_new_ukb_ld
    qui generate `type' upermvar1 = ea_new_ukb_ld[id]
    qui generate `type' upermvar2 = pgi_easingle[id]
    local type2: type firstborn
    qui generate `type2' upermvar3 = firstborn[id]

    famfe_seg upermvar1 upermvar2 upermvar3
    mat permfb[`j', 1] = r(diff_fb)
}
drop permfb
svmat permfb

_pctile permfb1, p(2.5 5 95 97.5)

* actual difference is 0.204 (Table 2, column 5) - 0.114 (Table F.1, column 3) = 0.09
twoway (hist permfb1, yscale(range(0 7))) || (pci 0 0.09 7 0.09, lcolor(gs15)) || (pci 0 `r(r1)' 7 `r(r1)', lpattern(dash) lcolor(black)) || /*
    */ (pci 0 `r(r2)' 7 `r(r2)', lpattern(dash) lcolor(black)) || (pci 0 `r(r3)' 7 `r(r3)', lpattern(dash) lcolor(black)) || (pci 0 `r(r4)' 7 `r(r4)', lpattern(dash) lcolor(black)), /*
    */ scheme(s1mono) xtitle("Difference in coefficients (PGI1*firstborn vs PGI2*firstborn)") legend(order(1 2 3) lab(1 "Density") lab(2 "Actual difference") lab(3 "Critical values"))

graph export "Figure_F.1.pdf", as(pdf) name("Graph")
restore

*********************************************************************************
* APPENDIX G
* TABLE G.1. WITHIN-FAMILY REGRESSIONS EXPLAINING SELECTED SINGLE-TRAIT POLYGENIC
*            SCORES FROM THE POLYGENIC SCORE REPOSITORY (BECKER ET AL, 2021) ON 
*            A DUMMY FOR BEING FIRSTBORN 
*********************************************************************************

// WARNING: CHANGE IN SAMPLE SIZE!
merge 1:1 ID using "UKB1_PGIrepo_v1.0.dta", gen(_scores1)
preserve
drop if _scores1==2
* Count, 15 obs from our original sample are missing 
* 14,835
count

// select ingle trait pgs
global pgi pgi_activitysingle pgi_adhdsingle pgi_adventuresingle pgi_afbsingle pgi_asteczrhisingle pgi_asthmasingle pgi_auditsingle pgi_bmisingle pgi_cannabissingle pgi_cpdsingle pgi_cpsingle pgi_depsingle pgi_dpwsingle pgi_easingle pgi_eversmokesingle pgi_extrasingle pgi_famsatsingle pgi_friendsatsingle pgi_hayfeversingle pgi_heightsingle pgi_highmathsingle pgi_leftoutsingle pgi_menarchesingle pgi_migrainesingle pgi_morningsingle pgi_narcissingle pgi_nearsightedsingle pgi_nebwomensingle pgi_neurosingle pgi_opensingle pgi_readingsingle pgi_religattsingle pgi_risksingle pgi_selfhealthsingle pgi_selfmathsingle pgi_swbsingle pgi_adhdmulti pgi_adventuremulti pgi_afbmulti pgi_allergycatmulti pgi_allergydustmulti pgi_allergypollenmulti pgi_asteczrhimulti pgi_asthmamulti pgi_auditmulti pgi_cogempmulti pgi_copdmulti pgi_cpmulti pgi_delaydiscmulti pgi_depmulti pgi_dpwmulti pgi_eamulti pgi_extramulti pgi_famsatmulti pgi_finsatmulti pgi_friendsatmulti pgi_hayfevermulti pgi_highmathmulti pgi_leftoutmulti pgi_lonelymulti pgi_menarchemulti pgi_nebmenmulti pgi_nebwomenmulti pgi_neuromulti pgi_religattmulti pgi_riskmulti pgi_selfhealthmulti pgi_selfmathmulti pgi_swbmulti pgi_voicedeepmulti pgi_worksatmulti

// standardize within the analysis sample 
foreach i in $pgi {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}
sum $pgi 

// regress by correcting p-value for multiple hypothesis testing 
xtset famid ID
global controls sex i.YoB i.MoB e_PC* 

foreach i in $pgi {
	qui eststo M`i': xtreg `i' firstborn $controls, cluster(famid) fe
} 

/* 
Anthropometric:
BMI             : Body Mass Index
HEIGHT          : Height
*/
esttab Mpgi_bmisingle Mpgi_heightsingle, ///
       noomitted nobase se ///
       stats(r2 N, fmt(3 0)) drop(e_PC* *.MoB *.YoB sex) b(3) se(3) ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, gender, 40 first PCs" "PGS is standardized") 


/*
Health and Health Behaviours: 
AUDIT           : Alcohol Misuse
ALLERGYCAT      : Allergy - Cat	- not presrnt 
ALLERGYDUST     : Allergy - Dust - not 
ALLERGYPOLLEN   : Allergy - Pollen - na
ASTECZRHI       : Asthma/Eczema/Rhinitis	
ASTHMA          : Asthma
ADHD            : Attention Deficit Hyperactivity Disorder
CANNABIS        : Cannabis Use
CPD             : Cigarettes per Day
COPD            : Chronic Obstructive Pulmonary Disease - not
DEP             : Depressive Symptoms
DPW             : Drinks per Week
EVERSMOKE       : Ever Smoker
HAYFEVER        : Hayfever (Allergic Rhinitis)
MIGRAINE        : Migraine
NEARSIGHTED     : Nearsightedness
ACTIVITY        : Physical Activity
SELFHEALTH      : Self-Rated Health 
*/
esttab Mpgi_auditsingle Mpgi_asthmasingle Mpgi_asteczrhisingle  Mpgi_adhdsingle Mpgi_cannabissingle Mpgi_cpdsingle Mpgi_depsingle, ///
       noomitted nobase se ///
       stats(r2 N, fmt(3 0)) drop(e_PC* *.MoB *.YoB sex) b(3) se(3) ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, gender, 40 first PCs" "PGS is standardized") 

esttab Mpgi_dpwsingle Mpgi_eversmokesingle Mpgi_hayfeversingle Mpgi_migrainesingle Mpgi_nearsightedsingle Mpgi_activitysingle Mpgi_selfhealthsingle, ///
       noomitted nobase se ///
       stats(r2 N, fmt(3 0)) drop(e_PC* *.MoB *.YoB sex) b(3) se(3) ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, gender, 40 first PCs" "PGS is standardized") 
	   
/*
Personality and well-being:
ADVENTURE       : Adventurousness
COGEMP          : Cognitive Empathy	not present
DELAYDISC       : Delay Discounting - not present 
EXTRA           : Exraversion 
LEFTOUT         : Left Out of Social Activity
FAMSAT          : Life Satisfaction - Family
FINSAT          : Life Satisfaction - Finance - not present 
FRIENDSAT       : Life Satisfaction - Friend
WORKSAT         : Life Satisfaction - Work - not present 
LONELY          : Loneliness - not present 
MORNING         : Morning person - 
Narciss         : Narcissism 
NEURO           : Neuroticism
OPEN            : Openness
RELIGATT        : Religious Attendance 
RISK            : Risk Tolerance
SWB             : Subjective well-being
*/

esttab Mpgi_adventuresingle Mpgi_extrasingle Mpgi_leftoutsingle Mpgi_famsatsingle Mpgi_friendsatsingle Mpgi_morningsingle, ///
       noomitted nobase se ///
       stats(r2 N, fmt(3 0)) drop(e_PC* *.MoB *.YoB sex) b(3) se(3) ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, gender, 40 first PCs" "PGS is standardized") 

esttab Mpgi_narcissingle Mpgi_neurosingle Mpgi_opensingle Mpgi_religattsingle Mpgi_risksingle Mpgi_swbsingle, ///
       noomitted nobase se ///
       stats(r2 N, fmt(3 0)) drop(e_PC* *.MoB *.YoB sex) b(3) se(3) ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, gender, 40 first PCs" "PGS is standardized") 
	   

*********************************************************************************
* APPENDIX H: Interaction terms in family fixed effects models
*********************************************************************************	   
clear all
capture log close	

cd "projectfolder"
use "analysis_data", clear
merge 1:1 ID using "PGS_ldpred_plink_EA_height_cvd_bmi.dta", gen(_scores)
keep if _scores==3

reg EA_new firstborn ea_new_ukb_ld ea_new_23me_ukb_ld rank family_size last_child sex
keep if e(sample)

egen N_famid_new=count(famid) if famid!=., by(famid)
tab N_famid_new
drop if N_famid_new==1
count

global pgs_new ea_new_ukb_ld ea_new_ukb_ld_0 ea_new_ukb_ld_1 ea_new_23me_ukb_ld
foreach i in $pgs_new {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}


global controls0 sex i.YoB i.MoB e_PC*
global controls1 sex i.YoB i.MoB e_PC* last_child 
global controls2 sex i.YoB i.MoB e_PC* last_child family_size_imp
global controls3 sex i.YoB i.MoB e_PC* family_size_imp

* BETWEEN FAMILY
reg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls3 if N_famid!=1, cluster(famid)
estimates store bf

reg ea_new_ukb_ld_0 ea_new_ukb_ld_1
sca corrscores = _b[ea_new_ukb_ld_1]
qui replace ea_new_ukb_ld_0 = ea_new_ukb_ld_0/sqrt(corrscores)
qui replace ea_new_ukb_ld_1 = ea_new_ukb_ld_1/sqrt(corrscores) 

preserve
expand 2, generate(replicant)
gen mainVar = ea_new_ukb_ld_0 if replicant == 0
gen mainVarint = firstborn * ea_new_ukb_ld_0 if replicant==0
replace mainVar = ea_new_ukb_ld_1 if replicant == 1
replace mainVarint = firstborn * ea_new_ukb_ld_1 if replicant==1
gen instrument = ea_new_ukb_ld_1 if replicant == 0
gen instrumentint = firstborn * ea_new_ukb_ld_1 if replicant==0
replace instrument = ea_new_ukb_ld_0 if replicant == 1
replace instrumentint = firstborn * ea_new_ukb_ld_0 if replicant==1
ivreghdfe EA_new (mainVar mainVarint = instrument instrumentint) i.firstborn $controls3, absorb(replicant, save) cluster(famid ID)
estimates store bf_iv
restore

* WITHIN FAMILY
*BASELINE REGRESSION FROM PAPER
xtreg EA_new i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls3 if N_famid!=1, cluster(famid) fe
estimates store wf
*xtreg  educ_years i.firstborn ea_new_ukb_ld firstborn#c.ea_new_ukb_ld $controls3 if N_famid!=1, cluster(famid) fe

preserve
expand 2, generate(replicant)
egen newid = concat(famid replicant)
gen long newidreal = real(newid)
xtset newidreal
gen mainVar = ea_new_ukb_ld_0  if replicant == 0
gen mainVarint = firstborn * ea_new_ukb_ld_0  if replicant==0
replace mainVar = ea_new_ukb_ld_1 if replicant == 1
replace mainVarint = firstborn * ea_new_ukb_ld_1 if replicant==1
gen instrument = ea_new_ukb_ld_1 if replicant == 0
gen instrumentint = firstborn * ea_new_ukb_ld_1 if replicant==0
replace instrument = ea_new_ukb_ld_0 if replicant == 1
replace instrumentint = firstborn * ea_new_ukb_ld_0 if replicant==1
ivreghdfe EA_new (mainVar mainVarint = instrument instrumentint) i.firstborn $controls3, absorb(newidreal, res save) cluster(famid ID) 
estimates store wf_iv
restore

twoway (function y = 12.184+0.428 + 0.757*x + 0.285*x, range(-2 2) alp(dot) fi(inten0) acol(gs10) alwidth(medthick) lc(black) lp("l")) ///
	   (function y = 12.184+ 0.757*x, range(-2 2) alp(dot) fi(inten0) acol(gs10) alwidth(medthick) lc(black) lp("-")), ///
	   legend(rows(1) lab(1 "First-borns") lab(2 "Later-borns")) xlabel(-2(1)2) ylabel(11(1)15) ///
	   xtitle("PGI for Years of Education") ytitle("Years of Education") scheme(s1mono)

esttab bf* wf*, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex family_size_imp) title("Comparison of Between and Within Family results") ///
	   se(3) b(3) ///
	   mtitles("OLS BF" "ORIV BF" "OLS WF" "ORIV WF") ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, being last child, gender and 40 first PCs" "PGSs are standardized") 
	   

*CONSTRUCTING FAMILY DEVIATIONS
bys famid: egen meanbo = mean(firstborn)
gen famdevbo = firstborn - meanbo
bys famid: egen meanpgi = mean(ea_new_ukb_ld)
gen famdevpgi = ea_new_ukb_ld-meanpgi
bys famid: egen meanEA = mean(EA_new)
gen delta_EA = EA_new - meanEA

xtreg EA_new ea_new_ukb_ld firstborn, fe
estimates store m1
reg EA_new famdevpgi famdevbo
estimates store m2
reg EA_new ea_new_ukb_ld meanpgi firstborn meanbo
estimates store m3
reg delta_EA famdevpgi famdevbo
reg delta_EA ea_new_ukb_ld meanpgi firstborn meanbo
estimates store m4

esttab m*, se star(* 0.10 ** 0.05 *** 0.01) stats(N r2) se(3) b(3) 

*THIS GIVES ALL THE SAME COEFFICIENTS, BUT THE POWER IS DIFFERENT
*xtreg EA_new ea_new_ukb_ld, fe
*reg EA_new famdevpgi
*reg EA_new ea_new_ukb_ld meanpgi
*reg delta_EA famdevpgi
*reg delta_EA ea_new_ukb_ld meanpgi

gen product = firstborn*ea_new_ukb_ld
bys famid: egen meanproduct = mean(product)
gen famdevproduct = product-meanproduct

gen pgimubo = ea_new_ukb_ld*meanbo
gen bomupgi = firstborn*meanpgi

xtreg EA_new ea_new_ukb_ld firstborn c.firstborn#c.ea_new_ukb_ld if e(sample), fe robust
estimates store m5
reg EA_new famdevpgi famdevbo c.famdevpgi#c.famdevbo if e(sample), robust
estimates store m6
reg EA_new famdevpgi famdevbo famdevproduct if e(sample), robust
estimates store m7
reg EA_new ea_new_ukb_ld meanpgi firstborn meanbo c.firstborn#c.ea_new_ukb_ld meanproduct if e(sample), robust
estimates store m8

esttab m5 m7 m8, se star(* 0.10 ** 0.05 *** 0.01) stats(N r2) se(3) b(3) mtitles("Eq (5)" "Eq (6)" "Eq (7)") 

* WITHIN FAMILY SEGMENTED REGRESSIONS
*BY FIRSTBORN
reg EA_new ea_new_ukb_ld meanpgi $controls3 if N_famid!=1 & firstborn==0, cluster(famid) 
estimates store seg_pgi_later
reg EA_new ea_new_ukb_ld meanpgi $controls3 if N_famid!=1 & firstborn==1, cluster(famid) 
estimates store seg_pgi_first

reg EA_new ea_new_ukb_ld meanpgi $controls3 if N_famid!=1 & firstborn==0
estimates store suest1
reg EA_new ea_new_ukb_ld meanpgi $controls3 if N_famid!=1 & firstborn==1
estimates store suest2
suest suest1 suest2, cluster(famid)
test [suest1_mean]ea_new_ukb_ld = [suest2_mean]ea_new_ukb_ld

bys famid: egen meanpgi0 = mean(ea_new_ukb_ld_0)
bys famid: egen meanpgi1 = mean(ea_new_ukb_ld_1)

preserve
expand 2, generate(replicant)
gen mainVar = ea_new_ukb_ld_0 if replicant == 0
replace mainVar = ea_new_ukb_ld_1 if replicant == 1
gen instrument = ea_new_ukb_ld_1 if replicant == 0
replace instrument = ea_new_ukb_ld_0 if replicant == 1
gen mean = meanpgi0 if replicant==0
replace mean = meanpgi1 if replicant==1
gen IVmean = meanpgi1 if replicant==0
replace IVmean = meanpgi0 if replicant==1
ivreghdfe EA_new (mainVar mean = instrument IVmean) $controls3 if firstborn==0, absorb(replicant, save) cluster(famid ID)
estimates store seg_pgi_oriv_later
ivreghdfe EA_new (mainVar mean = instrument IVmean) $controls3 if firstborn==1, absorb(replicant, save) cluster(famid ID)
estimates store seg_pgi_oriv_first
restore

esttab seg_pgi_later seg_pgi_first seg_pgi_oriv*, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex family_size_imp meanpgi mean) title("Segmented regressions by birth order") ///
	   se(3) b(3) ///
	   mtitles("OLS Later" "OLS First" "ORIV Later" "ORIV First") ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, being last child, gender and 40 first PCs" "PGSs are standardized")
	   
*BY QUARTILES OF THE PGI
xtile PGI = ea_new_ukb_ld, nq(4)

forvalues i=1/4 {
	xtreg EA_new firstborn $controls3 if N_famid!=1 & PGI==`i', fe cluster(famid) 
	estimates store seg_first_q`i'
}

esttab seg_first*, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex) title("Segmented regressions by quartile of the PGI") ///
	   se(3) b(3) ///
	   mtitles("Q1" "Q2" "Q3" "Q4")
	   
esttab seg_first*, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex) title("Segmented regressions by quartile of the PGI") ///
	   se(3) b(3) ///
	   mtitles("Q1" "Q2" "Q3" "Q4")
	   
xtreg EA_new firstborn $controls3 if N_famid!=1 & PGI==1|PGI==2, fe cluster(famid) 
estimates store seg_first_q12

xtreg EA_new firstborn $controls3 if N_famid!=1 & PGI==3|PGI==4, fe cluster(famid) 
estimates store seg_first_q34

reg EA_new famdevbo $controls3 if N_famid!=1 & PGI==1|PGI==2, cluster(famid)
estimates store seg_first_q12d
reg EA_new famdevbo $controls3 if N_famid!=1 & PGI==3|PGI==4, cluster(famid)
estimates store seg_first_q34d

esttab seg_first_q12 seg_first_q34 seg_first_q12d seg_first_q34d, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex family_size_imp) title("Segmented regressions by above- and below mean PGI") ///
	   se(3) b(3) ///
	   mtitles("Below" "Above" "Below" "Above")
	   
BREAK
	   

* PERMUTATION INFERENCE


capture program drop oriv
program oriv, rclass
args pgi1 pgi2 firstborn_p
preserve
drop meanpgi0 meanpgi1
qui expand 2, generate(replicant)
qui gen mainVar = `pgi1' if replicant == 0
qui replace mainVar = `pgi2' if replicant == 1
qui gen instrument = `pgi2' if replicant == 0
qui replace instrument = `pgi1' if replicant == 1
qui bys famid: egen meanpgi0 = mean(`pgi1')
qui bys famid: egen meanpgi1 = mean(`pgi2')
qui gen mean = meanpgi0 if replicant==0
qui replace mean = meanpgi1 if replicant==1
qui gen IVmean = meanpgi1 if replicant==0
qui replace IVmean = meanpgi0 if replicant==1
qui ivreghdfe EA_new (mainVar mean = instrument IVmean) $controls3 if `firstborn_p'==1, absorb(replicant, save) cluster(famid ID)
estimates store seg_pgi_oriv_later
sca bfirstborn = _b[mainVar]
qui ivreghdfe EA_new (mainVar mean = instrument IVmean) $controls3 if `firstborn_p'==0, absorb(replicant, save) cluster(famid ID)
estimates store seg_pgi_oriv_first
sca blaterborn = _b[mainVar]
return scalar diff = bfirstborn - blaterborn
restore
end

*permute ea_new_ukb_ld_0 ea_new_ukb_ld_1 firstborn r(diff), reps(10): oriv ea_new_ukb_ld_0 ea_new_ukb_ld_1
*hist coef, addplot(pci 0 0.16 4 0.16) scheme(s1mono) xtitle("Coefficient interaction term") legend(off)

*The Stata Journal (2010)
*10, Number 4, pp. 686–688
*Stata tip 92: Manual implementation of permutations
*and bootstraps
*Lars Angquist
gen id=_n
set seed 1
local n_perm = 500
mat perm = J(`n_perm',1,0)
forvalues j = 1 / `n_perm' {
di `j'
capture drop u 
capture drop upermvar*
gen double u=runiform()
sort u
local type: type ea_new_ukb_ld_0
qui generate `type' upermvar1 = ea_new_ukb_ld_0[id]
local type2: type ea_new_ukb_ld_1
qui generate `type2' upermvar2 = ea_new_ukb_ld_1[id]
drop u
gen double u=runiform()
sort u
local type3: type firstborn
qui generate `type3' upermvar3 = firstborn[id]
oriv upermvar1 upermvar2 upermvar3
mat perm[`j',1] = r(diff)
}
svmat perm 
_pctile perm1, p(2.5 5 95 97.5)
twoway (hist perm1) || (pci 0 0.489 3 0.489) || (pci 0 `r(r1)' 3 `r(r1)', lpattern(dash) lcolor(black)) || (pci 0 `r(r2)' 3 `r(r2)', lpattern(dash) lcolor(black)) || /*
	*/ (pci 0 `r(r3)' 3 `r(r3)', lpattern(dash) lcolor(black)) || (pci 0 `r(r4)' 3 `r(r4)', lpattern(dash) lcolor(black)), /*
	*/ scheme(s1mono) xtitle("Difference in coefficients") legend(order(1 2 3) lab(1 "Density") lab(2 "Actual difference") lab(3 "Critical values"))
	
BREAK

capture program drop famfe_seg
program famfe_seg, rclass
capture drop PGI
args pgi firstborn_p
xtile PGI = `pgi', nq(2)
qui xtreg EA_new `firstborn_p' $controls3 if N_famid!=1 & PGI==1, fe cluster(famid) 
estimates store seg_first_q1
sca bfirstborn1 = _b[`firstborn_p']

qui xtreg EA_new `firstborn_p' $controls3 if N_famid!=1 & PGI==2, fe cluster(famid) 
estimates store seg_first_q2
sca bfirstborn2 = _b[`firstborn_p']

return scalar diff_fb = bfirstborn2 - bfirstborn1
end

local n_permfb = 500
mat permfb = J(`n_permfb',1,0)
forvalues j = 1/`n_permfb' {
	di `j'
	capture drop u
	capture drop upermvar*
	gen double u = runiform()
	sort u
	local type: type ea_new_ukb_ld
	qui generate `type' upermvar1 = ea_new_ukb_ld[id]
	local type2: type firstborn
	qui generate `type2' upermvar2 = firstborn[id]
	famfe_seg upermvar1 upermvar2
	mat permfb[`j',1]=r(diff_fb)
}

svmat permfb
_pctile permfb1, p(2.5 5 95 97.5)
twoway (hist permfb1)|| (pci 0 0.47 2 0.47) || (pci 0 `r(r1)' 2 `r(r1)', lpattern(dash) lcolor(black)) || (pci 0 `r(r2)' 2 `r(r2)', lpattern(dash) lcolor(black)) || /*
	*/ (pci 0 `r(r3)' 2 `r(r3)', lpattern(dash) lcolor(black)) || (pci 0 `r(r4)' 2 `r(r4)', lpattern(dash) lcolor(black)), /*
	*/ scheme(s1mono) xtitle("Difference in coefficients")legend(order(1 2 3) lab(1 "Density") lab(2 "Actual difference") lab(3 "Critical values"))



* GIESSELMANN AND SCHMIDT-CATRAN (2020) - DOUBLE DEMEANED FE

gen productfamdev = famdevpgi*famdevbo
bys famid: egen fammeanproduct = mean(productfamdev)
gen ddproduct = productfamdev - fammeanproduct

xtreg EA_new ea_new_ukb_ld firstborn productfamdev $controls3, fe cluster(famid)
reg EA_new famdevpgi famdevbo ddproduct $controls3, cluster(famid)

reg delta_EA famdevpgi famdevbo c.famdevpgi#c.famdevbo if e(sample), robust

xtreg EA_new ea_new_ukb_ld firstborn c.firstborn#c.ea_new_ukb_ld $controls3 if e(sample), fe robust
reg EA_new famdevpgi famdevbo c.famdevpgi#c.famdevbo $controls3 if e(sample), robust
reg EA_new ea_new_ukb_ld meanpgi firstborn meanbo firstborn#c.ea_new_ukb_ld meanproduct $controls3 if e(sample), robust
reg EA_new ea_new_ukb_ld meanpgi firstborn meanbo firstborn#c.ea_new_ukb_ld c.ea_new_ukb_ld#c.meanbo c.firstborn#c.meanpgi c.meanpgi#c.meanbo $controls3 if e(sample), robust
reg delta_EA famdevpgi famdevbo c.famdevpgi#c.famdevbo $controls3 if e(sample), robust
*reg educ_years famdevpgi famdevbo c.famdevpgi#c.famdevbo $controls3 if e(sample), robust

bys famid: egen meanpgi0 = mean(ea_new_ukb_ld_0)
bys famid: egen meanpgi1 = mean(ea_new_ukb_ld_1)
gen famdevpgi0 = ea_new_ukb_ld_0 - meanpgi0
gen famdevpgi1 = ea_new_ukb_ld_1 - meanpgi1

*ORIV WITH TWO UKB SCORES
preserve
expand 2, generate(replicant)
gen mainVar = famdevpgi0 if replicant == 0
gen mainVarint = famdevbo * famdevpgi0 if replicant==0
replace mainVar = famdevpgi1 if replicant == 1
replace mainVarint = famdevbo * famdevpgi1 if replicant==1
gen instrument = famdevpgi1 if replicant == 0
gen instrumentint = famdevbo * famdevpgi1 if replicant==0
replace instrument = famdevpgi0 if replicant == 1
replace instrumentint = famdevbo * famdevpgi0 if replicant==1
ivreghdfe EA_new (mainVar mainVarint = instrument instrumentint) famdevbo $controls3, absorb(replicant, save) cluster(famid ID)
estimates store bf_iv
restore

preserve
expand 2, generate(replicant)
gen mainVar = famdevpgi0 if replicant == 0
gen mainVarint = firstborn * famdevpgi0 if replicant==0
replace mainVar = famdevpgi1 if replicant == 1
replace mainVarint = firstborn * famdevpgi1 if replicant==1
gen instrument = famdevpgi1 if replicant == 0
gen instrumentint = firstborn * famdevpgi1 if replicant==0
replace instrument = famdevpgi0 if replicant == 1
replace instrumentint = firstborn * famdevpgi0 if replicant==1
ivreghdfe EA_new (mainVar mainVarint = instrument instrumentint) firstborn $controls3, absorb(replicant, save) cluster(famid ID)
estimates store bf_iv
restore

************************************************************************************
* APPENDIX I: CONTROLLING FOR IMPUTATION-BASED PARENTAL PGS
************************************************************************************

clear all
capture log close	

cd "projectfolder"

use "analysis_data_organized", clear

qui reg EA_new firstborn ea_new_ukb_ld ea_new_23me_ukb_ld rank family_size last_child sex
keep if e(sample)

egen N_famid_new=count(famid) if famid!=., by(famid)
tab N_famid_new
drop if N_famid_new==1
count

* keep these scores to compare 
global pgs_new ea_new_ukb_ld ea_new_ukb_ld_0 ea_new_ukb_ld_1 ea_new_23me_ukb_ld
foreach i in $pgs_new {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}
  
// 14,850 

* merge snipar pgs

merge 1:1 ID using "parental_pgs.dta", gen(_ids)
* 14,850 matched 
keep if _ids==3

* standardize pgs from snipar
global pgs_snipar ea_ukb_ld_1_proband ea_ukb_ld_1_paternal ea_ukb_ld_1_maternal ea_ukb_ld_0_proband ea_ukb_ld_0_paternal ea_ukb_ld_0_maternal ea_ukb_ld_proband ea_ukb_ld_paternal ea_ukb_ld_maternal

foreach i in $pgs_snipar {
	qui sum `i', detail
	replace `i' = (`i' - r(mean))/r(sd)
}

* compare to pgs used in the paper 
corr ea_new_ukb_ld_0    ea_ukb_ld_0_proband     ea_ukb_ld_0_paternal    ea_ukb_ld_0_maternal // correlations are negative, change signs 
corr ea_new_ukb_ld_1    ea_ukb_ld_1_proband     ea_ukb_ld_1_paternal    ea_ukb_ld_1_maternal
corr ea_new_ukb_ld      ea_ukb_ld_proband       ea_ukb_ld_paternal      ea_ukb_ld_maternal
count 

* change signs 
foreach i in $pgs_snipar {
	replace `i' = (-1)*`i'
}


reg ea_new_ukb_ld  ea_ukb_ld_proband ea_ukb_ld_paternal ea_ukb_ld_maternal
reg ea_new_ukb_ld  ea_ukb_ld_proband 
* gxe 
* check replicability of baseline results with snipar pgs 
xtset famid ID
global controls sex i.YoB i.MoB e_PC* family_size_imp
xtreg  EA_new i.firstborn ea_ukb_ld_proband firstborn#c.ea_ukb_ld_proband $controls, cluster(famid) fe
// results are consistent

********************************************************************************
* Table I.1. Regressions of years of education on the gene-environment interaction 
* controlling for imputation-based parental PGSs.
********************************************************************************
// Additional scores used for ORIV: ea_new_ukb_ld_0 ea_new_ukb_ld_1
// control variables 
xtset famid ID
global controls sex i.YoB i.MoB e_PC* family_size_imp

// Column 5: ols without interaction between fam with parental pgs 
qui eststo ols_bf_parents:     reg EA_new i.firstborn ea_ukb_ld_proband firstborn#c.ea_ukb_ld_proband ea_ukb_ld_paternal ea_ukb_ld_maternal  $controls, cluster(famid)
// Column 7: ols with interaction between fam with parental pgs 
qui eststo ols_bf_parents_int: reg EA_new i.firstborn ea_ukb_ld_proband firstborn#c.ea_ukb_ld_proband ea_ukb_ld_paternal ea_ukb_ld_maternal firstborn#c.ea_ukb_ld_paternal firstborn#c.ea_ukb_ld_maternal $controls, cluster(famid)

// Column 6: ORIV 
// ea_ukb_ld_1_proband     ea_ukb_ld_1_paternal    ea_ukb_ld_1_maternal
// ea_ukb_ld_0_proband     ea_ukb_ld_0_paternal    ea_ukb_ld_0_maternal

** compute correlation between proband pgs to determine the scaling factor
reg ea_ukb_ld_0_proband ea_ukb_ld_1_proband 
sca corrscores = _b[ea_ukb_ld_1_proband]
qui replace ea_ukb_ld_0_proband  = ea_ukb_ld_0_proband/sqrt(corrscores)
qui replace ea_ukb_ld_1_proband  = ea_ukb_ld_1_proband/sqrt(corrscores)

** Create the interaction vars 
gen firstborn_0=firstborn*ea_ukb_ld_0_proband 
gen firstborn_1=firstborn*ea_ukb_ld_1_proband 

** compute correlation between parents pgs to determine the scaling factor
reg ea_ukb_ld_0_paternal ea_ukb_ld_1_paternal
sca corrscores = _b[ea_ukb_ld_1_paternal]
qui replace ea_ukb_ld_0_paternal  = ea_ukb_ld_0_paternal/sqrt(corrscores)
qui replace ea_ukb_ld_1_paternal  = ea_ukb_ld_1_paternal/sqrt(corrscores)

reg ea_ukb_ld_0_maternal ea_ukb_ld_1_maternal
sca corrscores = _b[ea_ukb_ld_1_maternal]
qui replace ea_ukb_ld_0_maternal  = ea_ukb_ld_0_maternal/sqrt(corrscores)
qui replace ea_ukb_ld_1_maternal  = ea_ukb_ld_1_maternal/sqrt(corrscores)


// Column 6: ORIV without interaction between parental genes and firstborn
preserve
expand 2, generate(replicant)
gen mainVar = ea_ukb_ld_0_proband if replicant == 0
gen mainVarint = firstborn * ea_ukb_ld_0_proband if replicant==0
replace mainVar = ea_ukb_ld_1_proband if replicant == 1
replace mainVarint = firstborn * ea_ukb_ld_1_proband if replicant==1
gen instrument = ea_ukb_ld_1_proband if replicant == 0
gen instrumentint = firstborn * ea_ukb_ld_1_proband if replicant==0
replace instrument = ea_ukb_ld_0_proband if replicant == 1
replace instrumentint = firstborn * ea_ukb_ld_0_proband if replicant==1

* instrument parental pgs 
gen mainVarmom = ea_ukb_ld_0_maternal if replicant == 0
replace mainVarmom = ea_ukb_ld_1_maternal if replicant == 1
gen instrumentmom = ea_ukb_ld_1_maternal if replicant == 0
replace instrumentmom = ea_ukb_ld_0_maternal if replicant == 1

gen mainVardad = ea_ukb_ld_0_paternal if replicant == 0
replace mainVardad = ea_ukb_ld_1_paternal if replicant == 1
gen instrumentdad = ea_ukb_ld_1_paternal if replicant == 0
replace instrumentdad = ea_ukb_ld_0_paternal if replicant == 1

reghdfe mainVar instrument $controls, absorb(replicant) cluster(famid ID)
testparm instrument

ivreghdfe EA_new (mainVar mainVarint mainVarmom mainVardad = instrument instrumentint instrumentmom instrumentdad) firstborn $controls, absorb(replicant, save) cluster(famid ID)
estimates store bf_iv
sum __hdfe1__
*THE BELOW IS TO GET THE RIGHT CONSTANT TERM FOR THE TABLE
forvalues x=0/1 {
	qui gen constant`x' = replicant == `x'
	}
ivreghdfe EA_new (mainVar mainVarint mainVarmom mainVardad = instrument instrumentint instrumentmom instrumentdad) firstborn $controls constant*, cluster(famid ID)
restore

// Column 8: ORIV with parental PGS and interactions
preserve
expand 2, generate(replicant)
* Proband
gen mainVar = ea_ukb_ld_0_proband if replicant == 0
gen mainVarint = firstborn * ea_ukb_ld_0_proband if replicant==0
replace mainVar = ea_ukb_ld_1_proband if replicant == 1
replace mainVarint = firstborn * ea_ukb_ld_1_proband if replicant==1
gen instrument = ea_ukb_ld_1_proband if replicant == 0
gen instrumentint = firstborn * ea_ukb_ld_1_proband if replicant==0
replace instrument = ea_ukb_ld_0_proband if replicant == 1
replace instrumentint = firstborn * ea_ukb_ld_0_proband if replicant==1

* Mom
gen mainVarmom = ea_ukb_ld_0_maternal if replicant == 0
gen mainVarintmom = firstborn * ea_ukb_ld_0_maternal if replicant==0
replace mainVarmom = ea_ukb_ld_1_maternal if replicant == 1
replace mainVarintmom = firstborn * ea_ukb_ld_1_maternal if replicant==1
gen instrumentmom = ea_ukb_ld_1_maternal if replicant == 0
gen instrumentintmom = firstborn * ea_ukb_ld_1_maternal if replicant==0
replace instrumentmom = ea_ukb_ld_0_maternal if replicant == 1
replace instrumentintmom= firstborn * ea_ukb_ld_0_maternal if replicant==1

* Dad 
gen mainVardad = ea_ukb_ld_0_paternal if replicant == 0
gen mainVarintdad = firstborn * ea_ukb_ld_0_paternal if replicant==0
replace mainVardad = ea_ukb_ld_1_paternal if replicant == 1
replace mainVarintdad = firstborn * ea_ukb_ld_1_paternal if replicant==1
gen instrumentdad = ea_ukb_ld_1_paternal if replicant == 0
gen instrumentintdad = firstborn * ea_ukb_ld_1_paternal if replicant==0
replace instrumentdad = ea_ukb_ld_0_paternal if replicant == 1
replace instrumentintdad = firstborn * ea_ukb_ld_0_paternal if replicant==1


reghdfe mainVar instrument $controls, absorb(replicant) cluster(famid ID)
testparm instrument

ivreghdfe EA_new (mainVar mainVarint mainVarmom mainVarintmom mainVardad mainVarintdad = instrument instrumentint instrumentmom instrumentintmom instrumentdad instrumentintdad) firstborn $controls, absorb(replicant, save) cluster(famid ID)
estimates store bf_iv_par_int
sum __hdfe1__
*THE BELOW IS TO GET THE RIGHT CONSTANT TERM FOR THE TABLE
forvalues x=0/1 {
	qui gen constant`x' = replicant == `x'
	}
ivreghdfe EA_new (mainVar mainVarint mainVarmom mainVarintmom mainVardad mainVarintdad = instrument instrumentint instrumentmom instrumentintmom instrumentdad instrumentintdad) firstborn $controls constant*, cluster(famid ID)
restore

// Compile the table, add the main base results from Table 2 in the main text 
esttab ols_bf_parents bf_iv_par ols_bf_parents_int bf_iv_par_int, noomitted nobase se star(* 0.10 ** 0.05 *** 0.01) ///
	   stats(N r2) drop(e_PC_* *.MoB *.YoB sex family_size_imp ) title("Comparison of ORIV results to OLS") ///
	   se(3) b(3) ///
	   mtitles("OLS Between-family" "OLS Between-family" "ORIV Between-family" "ORIV Between-family") ///
	   addnotes("Excludes non-Europeans" "Controls for year and month of birth, being last child, gender and 40 first PCs" "PGSs are standardized") 
